pip install pandas
Requirement already satisfied: pandas in c:\users\shrey\anaconda3\lib\site-packages (1.4.2) Requirement already satisfied: numpy>=1.18.5 in c:\users\shrey\anaconda3\lib\site-packages (from pandas) (1.21.5) Requirement already satisfied: python-dateutil>=2.8.1 in c:\users\shrey\anaconda3\lib\site-packages (from pandas) (2.8.2) Requirement already satisfied: pytz>=2020.1 in c:\users\shrey\anaconda3\lib\site-packages (from pandas) (2021.3) Requirement already satisfied: six>=1.5 in c:\users\shrey\anaconda3\lib\site-packages (from python-dateutil>=2.8.1->pandas) (1.16.0) Note: you may need to restart the kernel to use updated packages.
import pandas as pd
#load the dataset
df = pd.read_csv("C:/Users/shrey/OneDrive/Desktop/project/DataScience_salaries_2024.csv")
print(df.head())
work_year experience_level employment_type job_title \
0 2021 MI FT Data Scientist
1 2021 MI FT BI Data Analyst
2 2020 MI FT Data Scientist
3 2021 MI FT ML Engineer
4 2022 SE FT Lead Machine Learning Engineer
salary salary_currency salary_in_usd employee_residence remote_ratio \
0 30400000 CLP 40038 CL 100
1 11000000 HUF 36259 HU 50
2 11000000 HUF 35735 HU 50
3 8500000 JPY 77364 JP 50
4 7500000 INR 95386 IN 50
company_location company_size
0 CL L
1 US L
2 HU L
3 JP S
4 IN L
df
| work_year | experience_level | employment_type | job_title | salary | salary_currency | salary_in_usd | employee_residence | remote_ratio | company_location | company_size | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2021 | MI | FT | Data Scientist | 30400000 | CLP | 40038 | CL | 100 | CL | L |
| 1 | 2021 | MI | FT | BI Data Analyst | 11000000 | HUF | 36259 | HU | 50 | US | L |
| 2 | 2020 | MI | FT | Data Scientist | 11000000 | HUF | 35735 | HU | 50 | HU | L |
| 3 | 2021 | MI | FT | ML Engineer | 8500000 | JPY | 77364 | JP | 50 | JP | S |
| 4 | 2022 | SE | FT | Lead Machine Learning Engineer | 7500000 | INR | 95386 | IN | 50 | IN | L |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 14833 | 2022 | MI | FT | Business Intelligence Developer | 15000 | USD | 15000 | GH | 100 | GH | M |
| 14834 | 2020 | EX | FT | Staff Data Analyst | 15000 | USD | 15000 | NG | 0 | CA | M |
| 14835 | 2021 | EN | FT | Machine Learning Developer | 15000 | USD | 15000 | TH | 100 | TH | L |
| 14836 | 2022 | EN | FT | Data Analyst | 15000 | USD | 15000 | ID | 0 | ID | L |
| 14837 | 2020 | EN | PT | ML Engineer | 14000 | EUR | 15966 | DE | 100 | DE | S |
14838 rows × 11 columns
summary_statistics= df.describe()
df.describe()
| work_year | salary | salary_in_usd | remote_ratio | |
|---|---|---|---|---|
| count | 14838.000000 | 1.483800e+04 | 14838.000000 | 14838.000000 |
| mean | 2023.138900 | 1.650227e+05 | 149874.718763 | 32.760480 |
| std | 0.700799 | 3.562354e+05 | 69009.181349 | 46.488278 |
| min | 2020.000000 | 1.400000e+04 | 15000.000000 | 0.000000 |
| 25% | 2023.000000 | 1.021000e+05 | 102000.000000 | 0.000000 |
| 50% | 2023.000000 | 1.422000e+05 | 141300.000000 | 0.000000 |
| 75% | 2024.000000 | 1.875000e+05 | 185900.000000 | 100.000000 |
| max | 2024.000000 | 3.040000e+07 | 800000.000000 | 100.000000 |
data_types_missing= df.info()
#there are no null values in the dataset
<class 'pandas.core.frame.DataFrame'> RangeIndex: 14838 entries, 0 to 14837 Data columns (total 11 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 work_year 14838 non-null int64 1 experience_level 14838 non-null object 2 employment_type 14838 non-null object 3 job_title 14838 non-null object 4 salary 14838 non-null int64 5 salary_currency 14838 non-null object 6 salary_in_usd 14838 non-null int64 7 employee_residence 14838 non-null object 8 remote_ratio 14838 non-null int64 9 company_location 14838 non-null object 10 company_size 14838 non-null object dtypes: int64(4), object(7) memory usage: 1.2+ MB
print(df.nunique())
work_year 5 experience_level 4 employment_type 4 job_title 153 salary 2363 salary_currency 23 salary_in_usd 2730 employee_residence 88 remote_ratio 3 company_location 77 company_size 3 dtype: int64
#transformation of the codes of the categoricl variables
df['experience_level'] = df['experience_level'].replace({'SE': 'Expert', 'MI': 'Intermediate', 'EN': 'Junior', 'EX': 'Director'})
df['employment_type'] = df['employment_type'].replace({'FT': 'Full-time', 'CT': 'Contract', 'FL': 'Freelance', 'PT': 'Part-time'})
def country_name(country_code):
try:
return pycountry.countries.get(alpha_2=country_code).name
except:
return 'other'
df['company_location'] = df['company_location'].apply(country_name)
df['employee_residence'] = df['employee_residence'].apply(country_name)
# Categorical variables
for column in ['work_year','experience_level','employment_type','company_size','remote_ratio','job_title','company_location']:
print(df[column].unique())
[2021 2020 2022 2023 2024] ['Intermediate' 'Expert' 'Junior' 'Director'] ['Full-time' 'Freelance' 'Contract' 'Part-time'] ['L' 'S' 'M'] [100 50 0] ['Data Scientist' 'BI Data Analyst' 'ML Engineer' 'Lead Machine Learning Engineer' 'Data Science Manager' 'Head of Machine Learning' 'Research Engineer' 'Head of Data Science' 'AI Programmer' 'Machine Learning Engineer' 'Lead Data Scientist' 'Data Engineer' 'Applied Machine Learning Scientist' 'Lead Data Analyst' 'Data Analytics Manager' 'Data Integration Specialist' 'Principal Data Architect' 'NLP Engineer' 'Big Data Engineer' 'AI Research Engineer' 'Machine Learning Software Engineer' 'Data Analyst' 'Applied Data Scientist' 'AI Scientist' 'Data Analytics Lead' 'Business Data Analyst' 'Product Data Analyst' 'Computer Vision Engineer' 'Data Science Consultant' 'AI Architect' 'Analytics Engineer' 'Machine Learning Scientist' 'Research Scientist' 'Prompt Engineer' 'Principal Data Scientist' 'Applied Scientist' 'Deep Learning Engineer' 'Data Architect' 'AI Engineer' 'Data Infrastructure Engineer' 'Data Science' 'Director of Data Science' 'Data Science Tech Lead' 'BI Analyst' 'Data Lead' 'Head of Data' 'Analytics Engineering Manager' 'Data Product Owner' 'Business Intelligence' 'Machine Learning Infrastructure Engineer' 'Research Analyst' 'Managing Director Data Science' 'Finance Data Analyst' 'Data Operations Engineer' 'Robotics Software Engineer' 'MLOps Engineer' 'Lead Data Engineer' 'AI Developer' 'Data Science Lead' 'Data Science Engineer' 'Business Intelligence Manager' 'Business Intelligence Specialist' 'Business Intelligence Engineer' 'Data Modeler' 'AWS Data Architect' 'Machine Learning Researcher' 'Data Product Manager' 'Data Strategy Manager' 'ETL Developer' 'Cloud Data Architect' 'Computational Biologist' 'AI Software Engineer' 'Data Strategist' 'Data Manager' 'Business Intelligence Analyst' 'Applied Machine Learning Engineer' 'Data Specialist' 'AI Product Manager' 'Data Science Director' 'Software Data Engineer' 'Data Operations Specialist' 'BI Developer' 'Machine Learning Modeler' 'Decision Scientist' 'Data Management Specialist' 'Data Analyst Lead' 'Marketing Data Analyst' 'Machine Learning Manager' 'Principal Data Engineer' 'Data Science Practitioner' 'ML Ops Engineer' 'Principal Machine Learning Engineer' 'Data Quality Engineer' 'Cloud Database Engineer' 'Staff Machine Learning Engineer' 'Data Visualization Specialist' 'Data Scientist Lead' 'Robotics Engineer' 'Data Integration Developer' 'Data Developer' 'Machine Learning Developer' 'Staff Data Analyst' 'Business Intelligence Developer' 'Business Intelligence Lead' 'Data Pipeline Engineer' 'Data Quality Analyst' 'Data Visualization Engineer' 'Principal Data Analyst' 'Staff Data Scientist' 'Data Integration Engineer' 'Machine Learning Operations Engineer' 'Consultant Data Engineer' 'Cloud Data Engineer' 'Data Analytics Specialist' 'Data Operations Analyst' 'Data Science Analyst' 'AI Research Scientist' 'ETL Engineer' 'Computer Vision Software Engineer' 'Data Management Analyst' 'Data Operations Manager' 'Data Analytics Engineer' 'Data Visualization Analyst' 'Financial Data Analyst' 'Manager Data Management' 'Big Data Architect' 'Data Analytics Consultant' 'Autonomous Vehicle Technician' 'Big Data Developer' 'Deep Learning Researcher' 'Machine Learning Research Engineer' 'Data Management Consultant' 'Azure Data Engineer' 'Business Intelligence Data Analyst' 'Data Analytics Associate' 'Data Reporting Analyst' 'Marketing Data Scientist' 'Data Modeller' 'Marketing Data Engineer' 'Data DevOps Engineer' 'Data Operations Associate' 'Machine Learning Specialist' 'Encounter Data Management Professional' 'Applied Research Scientist' 'Admin & Data Analyst' 'Power BI Developer' 'Compliance Data Analyst' 'Sales Data Analyst' 'BI Data Engineer' 'Data Quality Manager' 'Quantitative Research Analyst' 'Insight Analyst' 'CRM Data Analyst'] ['other']
import matplotlib.pyplot as plt
import seaborn as sns
# Extract the "job title" column
job_titles = df['job_title']
# Calculate the frequency of each job title
title_counts = job_titles.value_counts()
# Extract the top 20 most frequent job titles
top_20_titles = title_counts.head(20)
# Create a DataFrame for the top 20 titles
top_20_df = pd.DataFrame({'Job Title': top_20_titles.index, 'Count': top_20_titles.values})
# Plotting the count plot
plt.figure(figsize=(12, 6))
sns.set(style="darkgrid")
ax = sns.barplot(data=top_20_df, x='Count', y='Job Title', palette='cubehelix')
plt.xlabel('Count')
plt.ylabel('Job Titles')
plt.title('Top 20 Most Frequent Job Titles')
# Add count labels to the bars
for i, v in enumerate(top_20_df['Count']):
ax.text(v + 0.2, i, str(v), color='black', va='center')
plt.tight_layout()
plt.show()
#calculate the number of individuals in each experience level
level_counts = df['experience_level'].value_counts()
# Create a pie chart
plt.figure(figsize=(7,12),dpi=80)
plt.pie(level_counts.values, labels=level_counts.index, autopct='%1.1f%%')
plt.title('Experience Level Distribution')
plt.show()
# Create a cross-tabulation of the two columns
cross_tab = pd.crosstab(df['experience_level'], df['company_size'])
# Create a heatmap using the cross-tabulation data
plt.figure(figsize=(10, 8))
sns.heatmap(cross_tab, annot=True, fmt="d", cmap='Reds')
plt.xlabel('Company Size')
plt.ylabel('Experience Level')
plt.title('Relationship between Experience Level and Company Size')
plt.show()
import matplotlib.pyplot as plt
from matplotlib import ticker
# Create bar chart
average_salary = df.groupby('job_title')['salary_in_usd'].mean().sort_values(ascending=False)
top_ten_salaries = average_salary.head(10)
plt.figure(figsize=(15,10),dpi=80)
plt.bar(top_ten_salaries.index, top_ten_salaries)
# Add labels to the chart
plt.xlabel('Job')
plt.ylabel('Salary $')
plt.title('Average of the ten highest salaries by Job Titles')
plt.xticks(rotation=35, ha='right')
plt.gca().yaxis.set_major_formatter(ticker.StrMethodFormatter('{x:,.0f}'))
plt.show()
import plotly.express as px
salary_by_country = df.groupby('company_location', as_index=False)['salary_in_usd'].mean()
fig = px.choropleth(salary_by_country,locations='company_location',locationmode='country names',color='salary_in_usd',
projection='equirectangular',hover_name='company_location',
labels={'salary_in_usd':'Average Salary in USD'},title='Distribution of average salary by company location')
fig.show("notebook")
common_jobs = ['Data Engineer', 'Data Scientist', 'Data Analyst', 'Machine Learning Engineer', 'Analytics Engineer','Research Scientist', 'Data Science Manager', 'Applied Scientist']
common_jobs = df[df['job_title'].isin(common_jobs)]
salary_common_jobs = common_jobs.groupby('job_title')['salary_in_usd'].mean().sort_values(ascending=False)
remote_common_jobs = common_jobs.groupby('job_title')['remote_ratio'].mean().sort_values(ascending=False)
salary_common_country = common_jobs.groupby('company_location')['salary_in_usd'].mean().sort_values(ascending=False)
# Create bar chart
salary_common_jobs = common_jobs.groupby('job_title')['salary_in_usd'].mean().sort_values(ascending=False)
plt.figure(figsize=(15,10),dpi=80)
plt.bar(salary_common_jobs.index, salary_common_jobs)
# Add labels to the chart
plt.xlabel('Job')
plt.ylabel('Salary $')
plt.title('Average salary for common Job Titles')
plt.xticks(rotation=20, ha='right')
plt.gca().yaxis.set_major_formatter(ticker.StrMethodFormatter('{x:,.0f}'))
plt.show()
# Create bar chart
remote_common_jobs = common_jobs.groupby('job_title')['remote_ratio'].mean().sort_values(ascending=False)
plt.figure(figsize=(15,10),dpi=80)
plt.bar(remote_common_jobs.index, remote_common_jobs)
# Add labels to the chart
plt.xlabel('Job')
plt.ylabel('% remote')
plt.title('Remote rate by Job Titles')
plt.xticks(rotation=20, ha='right')
plt.gca().yaxis.set_major_formatter(ticker.StrMethodFormatter('{x:,.0f}'))
plt.show()
# Distribution of experience_level
print(df['experience_level'].value_counts())
# Distribution of employment_type
print(df['employment_type'].value_counts())
# Distribution of company_size
print(df['company_size'].value_counts())
Expert 9696 Intermediate 3553 Junior 1148 Director 441 Name: experience_level, dtype: int64 Full-time 14772 Part-time 27 Contract 26 Freelance 13 Name: employment_type, dtype: int64 M 13674 L 983 S 181 Name: company_size, dtype: int64
# Average salary by experience level
print(df.groupby('experience_level')['salary_in_usd'].mean().sort_values(ascending=False))
experience_level Director 194730.210884 Expert 163700.967100 Intermediate 125386.553054 Junior 91656.841463 Name: salary_in_usd, dtype: float64
pip install matplotlib seaborn
Requirement already satisfied: matplotlib in c:\users\shrey\anaconda3\lib\site-packages (3.5.1) Requirement already satisfied: seaborn in c:\users\shrey\anaconda3\lib\site-packages (0.11.2) Requirement already satisfied: pillow>=6.2.0 in c:\users\shrey\anaconda3\lib\site-packages (from matplotlib) (9.0.1) Requirement already satisfied: pyparsing>=2.2.1 in c:\users\shrey\anaconda3\lib\site-packages (from matplotlib) (3.0.4) Requirement already satisfied: fonttools>=4.22.0 in c:\users\shrey\anaconda3\lib\site-packages (from matplotlib) (4.25.0) Requirement already satisfied: packaging>=20.0 in c:\users\shrey\anaconda3\lib\site-packages (from matplotlib) (21.3) Requirement already satisfied: cycler>=0.10 in c:\users\shrey\anaconda3\lib\site-packages (from matplotlib) (0.11.0) Requirement already satisfied: python-dateutil>=2.7 in c:\users\shrey\anaconda3\lib\site-packages (from matplotlib) (2.8.2) Requirement already satisfied: numpy>=1.17 in c:\users\shrey\anaconda3\lib\site-packages (from matplotlib) (1.21.5) Requirement already satisfied: kiwisolver>=1.0.1 in c:\users\shrey\anaconda3\lib\site-packages (from matplotlib) (1.3.2) Requirement already satisfied: pandas>=0.23 in c:\users\shrey\anaconda3\lib\site-packages (from seaborn) (1.4.2) Requirement already satisfied: scipy>=1.0 in c:\users\shrey\anaconda3\lib\site-packages (from seaborn) (1.7.3) Requirement already satisfied: pytz>=2020.1 in c:\users\shrey\anaconda3\lib\site-packages (from pandas>=0.23->seaborn) (2021.3) Requirement already satisfied: six>=1.5 in c:\users\shrey\anaconda3\lib\site-packages (from python-dateutil>=2.7->matplotlib) (1.16.0) Note: you may need to restart the kernel to use updated packages.
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
# Plotting average salary by experience level
plt.figure(figsize=(10, 6))
sns.barplot(x=df['experience_level'], y=df['salary_in_usd'], estimator=np.mean)
plt.xlabel('Experience Level')
plt.ylabel('Average Salary in USD')
plt.title('Average Salary by Experience Level')
plt.show()
# Correlation matrix
correlation_matrix = df.corr()
# Heatmap
plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f")
plt.title('Correlation Heatmap')
plt.show()
# Bar chart for average salary by experience level
plt.figure(figsize=(10, 6))
sns.barplot(x='experience_level', y='salary_in_usd', data=df, estimator=np.mean, ci=None)
plt.title('Average Salary by Experience Level')
plt.xlabel('Experience Level')
plt.ylabel('Average Salary in USD')
plt.show()